Setup

Load libraries

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape)
## Warning: package 'reshape' was built under R version 4.1.3
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(plotly) 
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:reshape':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(mice) # for imputing
## Warning: package 'mice' was built under R version 4.1.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(viridis)
## Warning: package 'viridis' was built under R version 4.1.3
## Loading required package: viridisLite
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(purrr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
## 
##     expand, smiths
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.1.3

Load data

# Load the raw spreadsheet; readxl keeps the original (messy) column names.
data <- readxl::read_excel("data.xlsx")
head(data)
## # A tibble: 6 x 14
##   Date  `Rented Bike Cou~`  Hour `Temperature(°~` `Humidity(%)` `Wind speed (m~`
##   <chr>              <dbl> <dbl>            <dbl>         <dbl>            <dbl>
## 1 42747                254     0             -5.2            37              2.2
## 2 42747                204     1             -5.5            38              0.8
## 3 42747                173     2             -6              39              1  
## 4 42747                107     3             -6.2            40              0.9
## 5 42747                 78     4             -6              36              2.3
## 6 42747                100     5             -6.4            37              1.5
## # ... with 8 more variables: `Visibility (10m)` <dbl>,
## #   `Dew point temperature(°C)` <dbl>, `Solar Radiation (MJ/m2)` <dbl>,
## #   `Rainfall(mm)` <dbl>, `Snowfall (cm)` <dbl>, Seasons <chr>, Holiday <chr>,
## #   `Functioning Day` <chr>
# check data type
glimpse(data)
## Rows: 8,760
## Columns: 14
## $ Date                        <chr> "42747", "42747", "42747", "42747", "42747~
## $ `Rented Bike Count`         <dbl> 254, 204, 173, 107, 78, 100, 181, 460, 930~
## $ Hour                        <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ~
## $ `Temperature(°C)`           <dbl> -5.2, -5.5, -6.0, -6.2, -6.0, -6.4, -6.6, ~
## $ `Humidity(%)`               <dbl> 37, 38, 39, 40, 36, 37, 35, 38, 37, 27, 24~
## $ `Wind speed (m/s)`          <dbl> 2.2, 0.8, 1.0, 0.9, 2.3, 1.5, 1.3, 0.9, 1.~
## $ `Visibility (10m)`          <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, ~
## $ `Dew point temperature(°C)` <dbl> -17.6, -17.6, -17.7, -17.6, -18.6, -18.7, ~
## $ `Solar Radiation (MJ/m2)`   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
## $ `Rainfall(mm)`              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Snowfall (cm)`             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Seasons                     <chr> "Winter", "Winter", "Winter", "Winter", "W~
## $ Holiday                     <chr> "No Holiday", "No Holiday", "No Holiday", ~
## $ `Functioning Day`           <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", ~
# check NaNs
# Share of missing values per column. Use TRUE, never the reassignable alias T.
naniar::gg_miss_var(data, show_pct = TRUE)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

Understand the Data

# One row per hour: expect 365 unique dates with 24 observations each.
count(data, Date) # number of unique dates
## # A tibble: 365 x 2
##    Date           n
##    <chr>      <int>
##  1 13/01/2018    24
##  2 13/02/2018    24
##  3 13/03/2018    24
##  4 13/04/2018    24
##  5 13/05/2018    24
##  6 13/06/2018    24
##  7 13/07/2018    24
##  8 13/08/2018    24
##  9 13/09/2018    24
## 10 13/10/2018    24
## # ... with 355 more rows
dim(data)
## [1] 8760   14
# nrow() is clearer (and less error-prone) than indexing into dim().
nrow(data) == 365 * 24
## [1] TRUE

As shown above we have 365 days × 24 hours = 8,760 data points. Goal: predict the number of bikes rented per hour (given information such as day of week, rainfall, etc.) and identify which factors affect it most.

Clean data

Rename variables so they are easier to understand and work with Original: Date : year-month-day Rented Bike count - Count of bikes rented at each hour Hour - Hour of the day Temperature-Temperature in Celsius Humidity - % Windspeed - m/s Visibility - 10m Dew point temperature - Celsius Solar radiation - MJ/m2 Rainfall - mm Snowfall - cm Seasons - Winter, Spring, Summer, Autumn Holiday - Holiday/No holiday Functional Day - NoFunc(Non Functional Hours), Fun(Functional hours)

Renamed to: date count hour temp humid wind visib dew solar rain snow season holiday func

# Rename columns to short snake_case names (mapping documented in the prose above).
colnames(data) <- c(
  "date", "count", "hour", "temp", "humid", "wind", "visib",
  "dew", "solar", "rain", "snow", "season", "holiday", "func"
)
# NOTE(review): removed attach(data). attach() is an anti-pattern: it masked
# plotly::wind (see the original masking message) and makes every bare name
# ambiguous. All later code references columns via data$col or via data-masking
# inside ggplot/dplyr calls, so nothing relied on the attached frame.

Dates have mixed formats. Parse for consistency.

# Tried Lubridate but 3456/8470 failed to parse.
# library(lubridate)
# parse_date_time(x = data$Date,
#                 orders = c("y-m-d", "d/m/y"),
#                 locale = "eng")
# dplyr::count(data, Date)

# Looked around and found linelist which has a guess_dates() function
# linelist may not be avail for current version of R
# if that's the case, install via pacman::p_load_gh("reconhub/linelist")
# guess_dates() resolves the mixed input formats (Excel serial numbers such as
# "42747" plus "dd/mm/yyyy" strings) into a single Date column.
# NOTE(review): spot-check a few known dates afterwards -- ambiguous day/month
# strings can be flipped silently by any guessing heuristic.
data$date <- linelist::guess_dates(data$date)
count(data, date)
## # A tibble: 365 x 2
##    date           n
##    <date>     <int>
##  1 2017-01-12    24
##  2 2017-02-12    24
##  3 2017-03-12    24
##  4 2017-04-12    24
##  5 2017-05-12    24
##  6 2017-06-12    24
##  7 2017-07-12    24
##  8 2017-08-12    24
##  9 2017-09-12    24
## 10 2017-10-12    24
## # ... with 355 more rows

Now looks good.

Extract day of week (could be a good predictor!)

# Combine Date and Hour. Order Date variables in front.
# Use named reordering (dplyr::relocate) instead of fragile positional indexing
# like data[, c(1, 3, 2, 4:14)], which silently breaks if a column is added.
data <- dplyr::relocate(data, date, hour, count)
# Build a proper timestamp from date + hour in the local (Seoul) time zone.
data$datetime <- as.POSIXct(paste(data$date, data$hour),
  format = "%Y-%m-%d %H", tz = "Asia/Seoul"
)
data <- dplyr::relocate(data, datetime)
# Day of week as an ordered factor -- likely a useful predictor.
data$weekday <- lubridate::wday(data$date, label = TRUE)
glimpse(data)
## Rows: 8,760
## Columns: 16
## $ datetime <dttm> 2017-01-12 00:00:00, 2017-01-12 01:00:00, 2017-01-12 02:00:0~
## $ date     <date> 2017-01-12, 2017-01-12, 2017-01-12, 2017-01-12, 2017-01-12, ~
## $ hour     <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ count    <dbl> 254, 204, 173, 107, 78, 100, 181, 460, 930, 490, 339, 360, 44~
## $ temp     <dbl> -5.2, -5.5, -6.0, -6.2, -6.0, -6.4, -6.6, -7.4, -7.6, -6.5, -~
## $ humid    <dbl> 37, 38, 39, 40, 36, 37, 35, 38, 37, 27, 24, 21, 23, 25, 26, 3~
## $ wind     <dbl> 2.2, 0.8, 1.0, 0.9, 2.3, 1.5, 1.3, 0.9, 1.1, 0.5, 1.2, 1.3, 1~
## $ visib    <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 1928, 1~
## $ dew      <dbl> -17.6, -17.6, -17.7, -17.6, -18.6, -18.7, -19.5, -19.3, -19.8~
## $ solar    <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.01, 0.23, 0~
## $ rain     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ snow     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ season   <chr> "Winter", "Winter", "Winter", "Winter", "Winter", "Winter", "~
## $ holiday  <chr> "No Holiday", "No Holiday", "No Holiday", "No Holiday", "No H~
## $ func     <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"~
## $ weekday  <ord> Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, T~

Assign levels to qualitative variables

# Separate qualitative and quantitative variables.
quant_vars <- c("temp", "humid", "wind", "visib", "dew", "solar", "rain", "snow")
qual_vars <- c("season", "weekday", "hour", "holiday", "func")
# change data type to factor
data[qual_vars] <- lapply(data[qual_vars], factor)
glimpse(data)
## Rows: 8,760
## Columns: 16
## $ datetime <dttm> 2017-01-12 00:00:00, 2017-01-12 01:00:00, 2017-01-12 02:00:0~
## $ date     <date> 2017-01-12, 2017-01-12, 2017-01-12, 2017-01-12, 2017-01-12, ~
## $ hour     <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ count    <dbl> 254, 204, 173, 107, 78, 100, 181, 460, 930, 490, 339, 360, 44~
## $ temp     <dbl> -5.2, -5.5, -6.0, -6.2, -6.0, -6.4, -6.6, -7.4, -7.6, -6.5, -~
## $ humid    <dbl> 37, 38, 39, 40, 36, 37, 35, 38, 37, 27, 24, 21, 23, 25, 26, 3~
## $ wind     <dbl> 2.2, 0.8, 1.0, 0.9, 2.3, 1.5, 1.3, 0.9, 1.1, 0.5, 1.2, 1.3, 1~
## $ visib    <dbl> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 1928, 1~
## $ dew      <dbl> -17.6, -17.6, -17.7, -17.6, -18.6, -18.7, -19.5, -19.3, -19.8~
## $ solar    <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.01, 0.23, 0~
## $ rain     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ snow     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ season   <fct> Winter, Winter, Winter, Winter, Winter, Winter, Winter, Winte~
## $ holiday  <fct> No Holiday, No Holiday, No Holiday, No Holiday, No Holiday, N~
## $ func     <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y~
## $ weekday  <ord> Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, Thu, T~
# print level-to-int mappings
# Improvement: loop over the variable NAMES so each mapping is labeled, and
# read the mapping straight from levels() (already in level order) instead of
# sorting unique values by hand.
for (v in qual_vars) {
  lv <- levels(data[[v]])
  cat("--", v, "--\n")
  print(data.frame(levels = lv, value = seq_along(lv)))
}
##   levels value
## 4 Autumn     1
## 2 Spring     2
## 3 Summer     3
## 1 Winter     4
##   levels value
## 2    Sun     1
## 5    Mon     2
## 7    Tue     3
## 3    Wed     4
## 1    Thu     5
## 4    Fri     6
## 6    Sat     7
##    levels value
## 1       0     1
## 2       1     2
## 3       2     3
## 4       3     4
## 5       4     5
## 6       5     6
## 7       6     7
## 8       7     8
## 9       8     9
## 10      9    10
## 11     10    11
## 12     11    12
## 13     12    13
## 14     13    14
## 15     14    15
## 16     15    16
## 17     16    17
## 18     17    18
## 19     18    19
## 20     19    20
## 21     20    21
## 22     21    22
## 23     22    23
## 24     23    24
##       levels value
## 2    Holiday     1
## 1 No Holiday     2
##   levels value
## 2     No     1
## 1    Yes     2
# reorder levels
# Re-level the factors into a natural order (seasons chronologically from
# Winter, week starting Monday, baseline "No Holiday" first) so that model
# coefficients and plot axes read sensibly. factor() keeps weekday ordered
# because its default is ordered = is.ordered(x).
data$season <- factor(data$season, levels = c("Winter", "Spring", "Summer", "Autumn"))
data$holiday <- factor(data$holiday, levels = c("No Holiday", "Holiday"))
data$weekday <- factor(data$weekday, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))
# check again
# Re-print the level-to-integer mapping for each qualitative variable to
# confirm the new ordering took effect.
for (col in data[qual_vars]) {
  df <- data.frame(levels = unique(col), value = as.numeric(unique(col)))
  print(df[order(df$value), ])
}
##   levels value
## 1 Winter     1
## 2 Spring     2
## 3 Summer     3
## 4 Autumn     4
##   levels value
## 5    Mon     1
## 7    Tue     2
## 3    Wed     3
## 1    Thu     4
## 4    Fri     5
## 6    Sat     6
## 2    Sun     7
##    levels value
## 1       0     1
## 2       1     2
## 3       2     3
## 4       3     4
## 5       4     5
## 6       5     6
## 7       6     7
## 8       7     8
## 9       8     9
## 10      9    10
## 11     10    11
## 12     11    12
## 13     12    13
## 14     13    14
## 15     14    15
## 16     15    16
## 17     16    17
## 18     17    18
## 19     18    19
## 20     19    20
## 21     20    21
## 22     21    22
## 23     22    23
## 24     23    24
##       levels value
## 1 No Holiday     1
## 2    Holiday     2
##   levels value
## 2     No     1
## 1    Yes     2
# summary statistics
# Five-number summaries for numeric columns and level counts for factors;
# use this to sanity-check ranges (humidity 0-98%, rain up to 35 mm, etc.).
summary(data)
##     datetime                        date                 hour     
##  Min.   :2017-01-12 00:00:00   Min.   :2017-01-12   0      : 365  
##  1st Qu.:2018-03-04 05:45:00   1st Qu.:2018-03-04   1      : 365  
##  Median :2018-06-06 11:30:00   Median :2018-06-06   2      : 365  
##  Mean   :2018-06-01 11:30:00   Mean   :2018-06-01   3      : 365  
##  3rd Qu.:2018-09-08 17:15:00   3rd Qu.:2018-09-08   4      : 365  
##  Max.   :2018-12-11 23:00:00   Max.   :2018-12-11   5      : 365  
##                                                     (Other):6570  
##      count             temp            humid            wind      
##  Min.   :   0.0   Min.   :-17.80   Min.   : 0.00   Min.   :0.000  
##  1st Qu.: 191.0   1st Qu.:  3.50   1st Qu.:42.00   1st Qu.:0.900  
##  Median : 504.5   Median : 13.70   Median :57.00   Median :1.500  
##  Mean   : 704.6   Mean   : 12.88   Mean   :58.23   Mean   :1.725  
##  3rd Qu.:1065.2   3rd Qu.: 22.50   3rd Qu.:74.00   3rd Qu.:2.300  
##  Max.   :3556.0   Max.   : 39.40   Max.   :98.00   Max.   :7.400  
##                                                                   
##      visib           dew              solar             rain        
##  Min.   :  27   Min.   :-30.600   Min.   :0.0000   Min.   : 0.0000  
##  1st Qu.: 940   1st Qu.: -4.700   1st Qu.:0.0000   1st Qu.: 0.0000  
##  Median :1698   Median :  5.100   Median :0.0100   Median : 0.0000  
##  Mean   :1437   Mean   :  4.074   Mean   :0.5691   Mean   : 0.1487  
##  3rd Qu.:2000   3rd Qu.: 14.800   3rd Qu.:0.9300   3rd Qu.: 0.0000  
##  Max.   :2000   Max.   : 27.200   Max.   :3.5200   Max.   :35.0000  
##                                                                     
##       snow            season           holiday      func      weekday   
##  Min.   :0.00000   Winter:2160   No Holiday:8328   No : 295   Mon:1200  
##  1st Qu.:0.00000   Spring:2208   Holiday   : 432   Yes:8465   Tue:1272  
##  Median :0.00000   Summer:2208                                Wed:1272  
##  Mean   :0.07507   Autumn:2184                                Thu:1248  
##  3rd Qu.:0.00000                                              Fri:1224  
##  Max.   :8.80000                                              Sat:1248  
##                                                               Sun:1296
# Compact structure dump: confirms the datetime/Date/factor conversions stuck.
str(data)
## tibble [8,760 x 16] (S3: tbl_df/tbl/data.frame)
##  $ datetime: POSIXct[1:8760], format: "2017-01-12 00:00:00" "2017-01-12 01:00:00" ...
##  $ date    : Date[1:8760], format: "2017-01-12" "2017-01-12" ...
##  $ hour    : Factor w/ 24 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ count   : num [1:8760] 254 204 173 107 78 100 181 460 930 490 ...
##  $ temp    : num [1:8760] -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
##  $ humid   : num [1:8760] 37 38 39 40 36 37 35 38 37 27 ...
##  $ wind    : num [1:8760] 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
##  $ visib   : num [1:8760] 2000 2000 2000 2000 2000 ...
##  $ dew     : num [1:8760] -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
##  $ solar   : num [1:8760] 0 0 0 0 0 0 0 0 0.01 0.23 ...
##  $ rain    : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
##  $ snow    : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
##  $ season  : Factor w/ 4 levels "Winter","Spring",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday : Factor w/ 2 levels "No Holiday","Holiday": 1 1 1 1 1 1 1 1 1 1 ...
##  $ func    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weekday : Ord.factor w/ 7 levels "Mon"<"Tue"<"Wed"<..: 4 4 4 4 4 4 4 4 4 4 ...

Other approaches:

# # cleanest way to do this if we don't care about the order of the mappings
# out<-data.matrix(data)

# # another way to change string to factor to int (using dplyr)
# dfFactors <- data %>%
#   mutate_if(is.character, as.factor)
# dfFactors <- data %>%
#   mutate_if(is.factor, as.numeric)

# # Eric's approach:

# # Replace values in Holiday column with 0s and 1s. 0 - No Holiday, 1 - Holiday
# data[which(data[,"holiday"]=="Holiday"), "holiday"] <- "1"
# data[which(data[,"holiday"]=="No Holiday"), "holiday"] <- "0"
# data["holiday"] <- as.integer(unlist(data["holiday"]))
# # Replace values in Seasons column with values from 0 to 3. 0 - Winter, 1 - Spring, 2 - Summer, 3 - Autumn
# data[which(data[,"season"]=="Winter"), "season"] <- "0"
# data[which(data[,"season"]=="Spring"), "season"] <- "1"
# data[which(data[,"season"]=="Summer"), "season"] <- "2"
# data[which(data[,"season"]=="Autumn"), "season"] <- "3"
# data["season"] <- as.integer(unlist(data["season"]))
# # Replace values in Functioning Day column with values 0s and 1s. 0 - No, 1 - Yes
# data[which(data[,"func"]=="Yes"), "func"] <- "1"
# data[which(data[,"func"]=="No"), "func"] <- "0"
# data["func"] <- as.integer(unlist(data["func"]))

Time Series Analysis

The dates are very unevenly spaced. Let’s visualize the increments. Let’s also see if there’s any overarching trends over time.

# Hourly rental counts across the full time span; tiny points limit overplotting.
# (size = 0.1 is the canonical spelling of the base-R alias cex = 0.1.)
ggplot(data, aes(x = datetime, y = count)) +
  geom_point(size = 0.1) +
  labs(x = "")

Note / something to consider: older observations should probably carry less weight, so vanilla regression is not ideal if we keep the 2017 data. Why the big jump in counts in early January? Do any of the linked papers explain it?

# get total count per day
# Formula interface: sum `count` within each `date`.
df <- aggregate(count ~ date, data = data, FUN = sum)
names(df)[2] <- "daily_count"
plot(df)

# get a better view of the increments (varying y values are distracting)
# Plot every observed date at y = 1 so gaps in the calendar stand out.
plot(df$date, rep(1, nrow(df)), cex = 0.1)

# slice and repeat
# NOTE(review): `seq` shadows base::seq() here. It still works (R looks up
# functions and data separately), but a distinct name such as keep_dates would
# be safer. Left unchanged because later chunks reuse this variable.
seq <- seq.Date(as.Date("2017-10-01"),as.Date("2019-01-01"),by="day")
df1 <- df[df$date %in% seq,]
plot(df1)

plot(df1$date, rep(1, length(df1$date)), cex = 0.1)

# could slice off a little more
seq <- seq.Date(as.Date("2017-12-01"),as.Date("2019-01-01"),by="day")
df1 <- df[df$date %in% seq,]
plot(df1)

plot(df1$date, rep(1, length(df1$date)), cex = 0.1)

# now looks good 
# this cutoff gives us a full year of data - 2017-12-12 to 2018-12-11
summary(df1)
##       date             daily_count   
##  Min.   :2017-12-12   Min.   :    0  
##  1st Qu.:2018-03-13   1st Qu.: 6483  
##  Median :2018-06-12   Median :18518  
##  Mean   :2018-06-11   Mean   :17204  
##  3rd Qu.:2018-09-10   3rd Qu.:26273  
##  Max.   :2018-12-11   Max.   :36149
# slice actual data and save
# Keep only the hourly rows inside the chosen one-year window and persist the
# cleaned data set for downstream work.
data <- data[data$date %in% seq,]
write.csv(data,"cleaned.csv", row.names = FALSE)
# let's see how many dates are missing and decide if we want to impute
# Build the complete daily calendar for the window, then outer-join it against
# the observed daily totals; rows with NA daily_count are the missing days.
seq <- seq.Date(min(df1$date),max(df1$date),by="day")
dates <- as.data.frame(seq)
# perform outer join and % missing
merged <- merge(x = df1, y = dates, by.x = "date", by.y = "seq", all = TRUE)
NAs <- merged[is.na(merged$daily_count),]
NAs
##           date daily_count
## 32  2018-01-12          NA
## 63  2018-02-12          NA
## 91  2018-03-12          NA
## 122 2018-04-12          NA
## 152 2018-05-12          NA
## 183 2018-06-12          NA
## 213 2018-07-12          NA
## 244 2018-08-12          NA
## 275 2018-09-12          NA
## 305 2018-10-12          NA
## 336 2018-11-12          NA
# seems like only the 12th of each month are missing
nrow(NAs)/nrow(merged)
## [1] 0.03013699
# we can impute since it is less than 10%, but first double check with a plot
# fill NAs with 0
# NOTE(review): this zero-fills every NA in the whole frame (here only
# daily_count has NAs). The zeros are plotting placeholders, not observations;
# real imputation is handled separately (see the mice code further down).
merged[is.na(merged)] <- 0
merged
##           date daily_count
## 1   2017-12-12        5496
## 2   2017-12-13        6019
## 3   2017-12-14        6398
## 4   2017-12-15        7198
## 5   2017-12-16        4632
## 6   2017-12-17        3776
## 7   2017-12-18        2620
## 8   2017-12-19        4334
## 9   2017-12-20        4568
## 10  2017-12-21        5734
## 11  2017-12-22        7184
## 12  2017-12-23        6624
## 13  2017-12-24        2014
## 14  2017-12-25        3966
## 15  2017-12-26        5605
## 16  2017-12-27        5351
## 17  2017-12-28        6594
## 18  2017-12-29        7663
## 19  2017-12-30        4027
## 20  2017-12-31        3423
## 21  2018-01-01        4290
## 22  2018-01-02        5377
## 23  2018-01-03        5132
## 24  2018-01-04       17388
## 25  2018-01-05       26820
## 26  2018-01-06       31928
## 27  2018-01-07        3231
## 28  2018-01-08       20712
## 29  2018-01-09       26010
## 30  2018-01-10       27909
## 31  2018-01-11       22964
## 32  2018-01-12           0
## 33  2018-01-13        3503
## 34  2018-01-14        4225
## 35  2018-01-15        6334
## 36  2018-01-16        6705
## 37  2018-01-17        6292
## 38  2018-01-18        6383
## 39  2018-01-19        7472
## 40  2018-01-20        5576
## 41  2018-01-21        4973
## 42  2018-01-22        4503
## 43  2018-01-23        3981
## 44  2018-01-24        3154
## 45  2018-01-25        3113
## 46  2018-01-26        2931
## 47  2018-01-27        2693
## 48  2018-01-28        2828
## 49  2018-01-29        4088
## 50  2018-01-30        3360
## 51  2018-01-31        3830
## 52  2018-02-01        6446
## 53  2018-02-02        5954
## 54  2018-02-03        8433
## 55  2018-02-04       21585
## 56  2018-02-05        7448
## 57  2018-02-06       30693
## 58  2018-02-07        5240
## 59  2018-02-08       20693
## 60  2018-02-09       26881
## 61  2018-02-10           0
## 62  2018-02-11       24077
## 63  2018-02-12           0
## 64  2018-02-13        6267
## 65  2018-02-14        6771
## 66  2018-02-15        3484
## 67  2018-02-16        2593
## 68  2018-02-17        3118
## 69  2018-02-18        4562
## 70  2018-02-19        7604
## 71  2018-02-20        7955
## 72  2018-02-21        7491
## 73  2018-02-22        7712
## 74  2018-02-23        5797
## 75  2018-02-24        6356
## 76  2018-02-25        6500
## 77  2018-02-26        9247
## 78  2018-02-27        8862
## 79  2018-02-28        3820
## 80  2018-03-01        6512
## 81  2018-03-02        3290
## 82  2018-03-03       12191
## 83  2018-03-04       21015
## 84  2018-03-05       21027
## 85  2018-03-06       29761
## 86  2018-03-07       29293
## 87  2018-03-08       20794
## 88  2018-03-09       10802
## 89  2018-03-10       30349
## 90  2018-03-11           0
## 91  2018-03-12           0
## 92  2018-03-13       15963
## 93  2018-03-14       19180
## 94  2018-03-15        6227
## 95  2018-03-16       15277
## 96  2018-03-17       15389
## 97  2018-03-18        9609
## 98  2018-03-19        8968
## 99  2018-03-20       11515
## 100 2018-03-21        7384
## 101 2018-03-22       14007
## 102 2018-03-23       14969
## 103 2018-03-24       12243
## 104 2018-03-25       10963
## 105 2018-03-26       13084
## 106 2018-03-27       14710
## 107 2018-03-28       17295
## 108 2018-03-29       17450
## 109 2018-03-30       19301
## 110 2018-03-31       19247
## 111 2018-04-01        6453
## 112 2018-04-02        2487
## 113 2018-04-03        4688
## 114 2018-04-04       19995
## 115 2018-04-05       26670
## 116 2018-04-06       33257
## 117 2018-04-07       31781
## 118 2018-04-08       18363
## 119 2018-04-09       29529
## 120 2018-04-10           0
## 121 2018-04-11       20471
## 122 2018-04-12           0
## 123 2018-04-13       23240
## 124 2018-04-14        6980
## 125 2018-04-15       10207
## 126 2018-04-16       23198
## 127 2018-04-17       22234
## 128 2018-04-18       21670
## 129 2018-04-19       17730
## 130 2018-04-20       22593
## 131 2018-04-21       24404
## 132 2018-04-22        6852
## 133 2018-04-23         977
## 134 2018-04-24       20144
## 135 2018-04-25       26285
## 136 2018-04-26       25670
## 137 2018-04-27       25861
## 138 2018-04-28       24940
## 139 2018-04-29       25349
## 140 2018-04-30       25462
## 141 2018-05-01        6967
## 142 2018-05-02        3932
## 143 2018-05-03        8597
## 144 2018-05-04        2596
## 145 2018-05-05       25353
## 146 2018-05-06       33424
## 147 2018-05-07       28182
## 148 2018-05-08       17415
## 149 2018-05-09       31114
## 150 2018-05-10        4522
## 151 2018-05-11       23472
## 152 2018-05-12           0
## 153 2018-05-13       25913
## 154 2018-05-14       27793
## 155 2018-05-15       28478
## 156 2018-05-16        6416
## 157 2018-05-17        3521
## 158 2018-05-18       16087
## 159 2018-05-19       30994
## 160 2018-05-20       27829
## 161 2018-05-21       31236
## 162 2018-05-22       13790
## 163 2018-05-23       27674
## 164 2018-05-24       28781
## 165 2018-05-25       29057
## 166 2018-05-26       28079
## 167 2018-05-27       28991
## 168 2018-05-28       31020
## 169 2018-05-29       24355
## 170 2018-05-30       30372
## 171 2018-05-31       31681
## 172 2018-06-01        5180
## 173 2018-06-02        3988
## 174 2018-06-03       12003
## 175 2018-06-04       11520
## 176 2018-06-05       14556
## 177 2018-06-06       30498
## 178 2018-06-07       33676
## 179 2018-06-08       19171
## 180 2018-06-09       27838
## 181 2018-06-10       16037
## 182 2018-06-11           0
## 183 2018-06-12           0
## 184 2018-06-13       36149
## 185 2018-06-14       23818
## 186 2018-06-15       33492
## 187 2018-06-16       34360
## 188 2018-06-17       32487
## 189 2018-06-18       27541
## 190 2018-06-19       35349
## 191 2018-06-20       34639
## 192 2018-06-21       34621
## 193 2018-06-22       34079
## 194 2018-06-23       31949
## 195 2018-06-24       27028
## 196 2018-06-25       32702
## 197 2018-06-26        5200
## 198 2018-06-27       29755
## 199 2018-06-28       21647
## 200 2018-06-29       30297
## 201 2018-06-30       20479
## 202 2018-07-01        4714
## 203 2018-07-02        4697
## 204 2018-07-03       11170
## 205 2018-07-04       11159
## 206 2018-07-05       28140
## 207 2018-07-06       32002
## 208 2018-07-07       31076
## 209 2018-07-08       23806
## 210 2018-07-09       30381
## 211 2018-07-10       27614
## 212 2018-07-11       16525
## 213 2018-07-12           0
## 214 2018-07-13       30088
## 215 2018-07-14       25814
## 216 2018-07-15       22790
## 217 2018-07-16       26823
## 218 2018-07-17       28435
## 219 2018-07-18       27896
## 220 2018-07-19       27820
## 221 2018-07-20       26699
## 222 2018-07-21       21283
## 223 2018-07-22       18563
## 224 2018-07-23       24890
## 225 2018-07-24       24635
## 226 2018-07-25       25918
## 227 2018-07-26       24915
## 228 2018-07-27       23275
## 229 2018-07-28       16880
## 230 2018-07-29       18190
## 231 2018-07-30       23695
## 232 2018-07-31       22897
## 233 2018-08-01        5711
## 234 2018-08-02        5613
## 235 2018-08-03        7711
## 236 2018-08-04        6666
## 237 2018-08-05       26429
## 238 2018-08-06       35103
## 239 2018-08-07       30080
## 240 2018-08-08       23880
## 241 2018-08-09       29813
## 242 2018-08-10       29362
## 243 2018-08-11        1721
## 244 2018-08-12           0
## 245 2018-08-13       22827
## 246 2018-08-14       23836
## 247 2018-08-15       18565
## 248 2018-08-16       25329
## 249 2018-08-17       28820
## 250 2018-08-18       26398
## 251 2018-08-19       24547
## 252 2018-08-20       26827
## 253 2018-08-21       25133
## 254 2018-08-22       26072
## 255 2018-08-23       15958
## 256 2018-08-24       11658
## 257 2018-08-25       27302
## 258 2018-08-26       19310
## 259 2018-08-27       14435
## 260 2018-08-28        9092
## 261 2018-08-29        9184
## 262 2018-08-30       20959
## 263 2018-08-31       27817
## 264 2018-09-01        5500
## 265 2018-09-02        6268
## 266 2018-09-03       11642
## 267 2018-09-04       19082
## 268 2018-09-05       28895
## 269 2018-09-06       25911
## 270 2018-09-07        6191
## 271 2018-09-08       21644
## 272 2018-09-09       28354
## 273 2018-09-10           0
## 274 2018-09-11           0
## 275 2018-09-12           0
## 276 2018-09-13       30991
## 277 2018-09-14       28199
## 278 2018-09-15       25079
## 279 2018-09-16       14178
## 280 2018-09-17       30290
## 281 2018-09-18           0
## 282 2018-09-19           0
## 283 2018-09-20       14282
## 284 2018-09-21       18266
## 285 2018-09-22       26398
## 286 2018-09-23       20060
## 287 2018-09-24       17259
## 288 2018-09-25       23350
## 289 2018-09-26       28018
## 290 2018-09-27       30789
## 291 2018-09-28           0
## 292 2018-09-29       31447
## 293 2018-09-30           0
## 294 2018-10-01        4161
## 295 2018-10-02        4425
## 296 2018-10-03       11064
## 297 2018-10-04       16696
## 298 2018-10-05           0
## 299 2018-10-06       29342
## 300 2018-10-07       20641
## 301 2018-10-08       23833
## 302 2018-10-09       30781
## 303 2018-10-10       24410
## 304 2018-10-11       18473
## 305 2018-10-12           0
## 306 2018-10-13       25542
## 307 2018-10-14       24349
## 308 2018-10-15       26198
## 309 2018-10-16       26956
## 310 2018-10-17       26502
## 311 2018-10-18       26504
## 312 2018-10-19       27437
## 313 2018-10-20       25355
## 314 2018-10-21       23906
## 315 2018-10-22       26075
## 316 2018-10-23       21601
## 317 2018-10-24       26418
## 318 2018-10-25       27258
## 319 2018-10-26       13377
## 320 2018-10-27       18398
## 321 2018-10-28       11589
## 322 2018-10-29       19877
## 323 2018-10-30       20822
## 324 2018-10-31       21545
## 325 2018-11-01        4017
## 326 2018-11-02        2850
## 327 2018-11-03       11843
## 328 2018-11-04           0
## 329 2018-11-05       26649
## 330 2018-11-06       24832
## 331 2018-11-07       24135
## 332 2018-11-08       20009
## 333 2018-11-09       31694
## 334 2018-11-10       24526
## 335 2018-11-11       13138
## 336 2018-11-12           0
## 337 2018-11-13       21992
## 338 2018-11-14       22707
## 339 2018-11-15       21502
## 340 2018-11-16       20660
## 341 2018-11-17       15689
## 342 2018-11-18       14319
## 343 2018-11-19       18956
## 344 2018-11-20       19127
## 345 2018-11-21       15732
## 346 2018-11-22       16496
## 347 2018-11-23       16314
## 348 2018-11-24        6477
## 349 2018-11-25       11212
## 350 2018-11-26       17162
## 351 2018-11-27       16282
## 352 2018-11-28       16524
## 353 2018-11-29       16423
## 354 2018-11-30       16297
## 355 2018-12-01        4111
## 356 2018-12-02        4813
## 357 2018-12-03       13339
## 358 2018-12-04       22729
## 359 2018-12-05        3034
## 360 2018-12-06       34544
## 361 2018-12-07       29428
## 362 2018-12-08       17498
## 363 2018-12-09       31809
## 364 2018-12-10       26237
## 365 2018-12-11       21003
# Daily totals as a line-plus-dot chart, then wrapped with plotly for
# interactive inspection of the zero-filled gap days.
daily_plot <- ggplot(merged, aes(x = date, y = daily_count)) +
  geom_line(color = "grey") +
  geom_point(shape = 21, color = "black", fill = "#69b3a2", size = 0.1) +
  ggtitle("")
# interactive plot
ggplotly(daily_plot)
# # impute
# merged <- merge(x = data, y = dates, by.x = "date", by.y = "seq", all = TRUE)
# # use predictive mean matching method
# tempData <- mice(merged,m=1,maxit=20,meth='pmm',seed=500)
# # visualize imputed data (red = imputed)
# # 2 separate plots (grouped by magnitude of x range)
# xyplot(tempData, count ~ temp + humid + dew + visib,
#        pch=18,
#        cex=0.5)
# xyplot(tempData, count ~ wind + solar + rain + snow,
#        pch=18,
#        cex=0.5)
# # quantitative
# densityplot(tempData,
#             count ~ temp + humid + wind + visib + dew + solar + rain + snow,
#             pch=18,
#             cex=0.5)
# # qualitative
# stripplot(tempData,pch=20,cex=0.5)
# # looks good, let's save
# # data <- complete(tempData,1)
# # write.csv(data,"imputed.csv", row.names = FALSE)
# Month as an ordered factor, for monthly/seasonal breakdowns.
data["month"] <- lubridate::month(data$datetime, label = TRUE)
# visualize response variable
boxplot(data$count)

hist(data$count)

# Histogram of every numeric column. pivot_longer() supersedes the retired
# gather(); names_to = "key" keeps the facet variable name identical.
data %>%
  keep(is.numeric) %>% 
  pivot_longer(everything(), names_to = "key", values_to = "value") %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar charts of every factor column except hour (24 levels would swamp the
# grid). Converting factors to character before pivoting avoids the
# "attributes are not identical across measure variables" warning -- gather()
# was silently dropping those attributes anyway, so behavior is unchanged.
data[, names(data) != "hour"] %>%
  keep(is.factor) %>% 
  mutate(across(everything(), as.character)) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_bar() + 
    coord_flip()

# fix for overlapping labels:
# https://stackoverflow.com/questions/19567986/overlapping-axis-labels-in-r
# scale_x_discrete(guide = guide_axis(n.dodge=3))

Univariate Analysis

Descriptive statistics

# Descriptive statistics + Histograms
# skimr::skim() prints per-type summaries: missingness, quantiles, level
# counts, and inline mini-histograms for each column.
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.3
skim(data)
Data summary
Name data
Number of rows 8496
Number of columns 17
_______________________
Column type frequency:
Date 1
factor 6
numeric 9
POSIXct 1
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2017-12-12 2018-12-11 2018-06-12 354

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
hour 0 1 FALSE 24 0: 354, 1: 354, 2: 354, 3: 354
season 0 1 FALSE 4 Spr: 2208, Sum: 2208, Aut: 2184, Win: 1896
holiday 0 1 FALSE 2 No : 8064, Hol: 432
func 0 1 FALSE 2 Yes: 8201, No: 295
weekday 0 1 TRUE 7 Tue: 1248, Wed: 1224, Sat: 1224, Sun: 1224
month 0 1 TRUE 12 Dec: 744, Jan: 720, Mar: 720, May: 720

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
count 0 1 716.83 650.32 0.0 192.0 531.00 1082.25 3556.00 ▇▃▂▁▁
temp 0 1 13.31 11.85 -17.8 4.2 14.40 22.70 39.40 ▂▅▆▇▂
humid 0 1 58.30 20.32 0.0 43.0 57.00 74.00 98.00 ▁▅▇▇▅
wind 0 1 1.72 1.03 0.0 0.9 1.50 2.30 7.40 ▇▇▂▁▁
visib 0 1 1436.21 605.57 27.0 937.0 1691.00 2000.00 2000.00 ▂▂▂▂▇
dew 0 1 4.49 12.98 -30.6 -4.0 5.70 15.12 27.20 ▂▃▇▇▆
solar 0 1 0.58 0.88 0.0 0.0 0.01 0.95 3.52 ▇▁▁▁▁
rain 0 1 0.15 1.14 0.0 0.0 0.00 0.00 35.00 ▇▁▁▁▁
snow 0 1 0.07 0.43 0.0 0.0 0.00 0.00 8.80 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
datetime 0 1 2017-12-12 2018-12-11 23:00:00 2018-06-12 11:30:00 8496

Boxplots

# select all numeric columns
dataNumeric <- data[, quant_vars]
# pivot_longer() replaces the superseded reshape::melt(); output columns
# are named "variable"/"value" to match what the plot code expects.
meltData <- pivot_longer(as.data.frame(dataNumeric), everything(),
                         names_to = "variable", values_to = "value")
p <- ggplot(meltData, aes(factor(variable), value, fill = factor(variable)))
# `scales=` spelled out (the original relied on partial matching of `scale=`).
p + geom_boxplot(width = 0.2, color = "darkgrey", alpha = 0.75) +
  scale_fill_viridis(discrete = TRUE, option = "plasma") +
  facet_wrap(~ variable, scales = "free")

# One boxplot of count per qualitative variable. Iterating over the
# variable NAMES and using the .data pronoun (instead of looping over the
# columns themselves and leaning on aes() falling back to the calling
# environment) gives each plot a meaningful axis label and legend title
# rather than the literal "x".
for (v in qual_vars) {
  p <- ggplot(data, aes(x = .data[[v]], y = count, fill = .data[[v]])) +
    geom_boxplot(width = 0.2, color = "darkgrey", alpha = 0.5) +
    scale_fill_viridis(discrete = TRUE, option = "mako") +
    labs(x = v, fill = v)
  print(p)
}

# One violin plot of count per qualitative variable; same naming fix as
# the boxplot loop (iterate over names, label plots with the variable).
for (v in qual_vars) {
  p <- ggplot(data, aes(x = .data[[v]], y = count, fill = .data[[v]])) +
    geom_violin(width = 0.8) +
    scale_fill_viridis(discrete = TRUE, option = "mako") +
    labs(x = v, fill = v)
  print(p)
}

Stripplots

# Strip plots of count against each qualitative predictor.
# geom_jitter()'s default position is position_jitter(); passing
# position_dodge(0.5) REPLACED it, which silently disabled the jitter and
# drew the points in straight columns. position_jitter() restores real
# horizontal jitter; height = 0 so the count values are not perturbed.
ggplot(data, aes(x = season, y = count)) +
  geom_jitter(position = position_jitter(width = 0.25, height = 0))

ggplot(data, aes(x = weekday, y = count)) +
  geom_jitter(position = position_jitter(width = 0.25, height = 0))

ggplot(data, aes(x = hour, y = count)) +
  geom_jitter(position = position_jitter(width = 0.25, height = 0))

ggplot(data, aes(x = holiday, y = count)) +
  geom_jitter(position = position_jitter(width = 0.25, height = 0))

ggplot(data, aes(x = func, y = count)) +
  geom_jitter(position = position_jitter(width = 0.25, height = 0))

Check for Multicollinearity (Multivariate Analysis)

Correlation Plots

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
# Mixed correlation plot of the quantitative variables: coefficients in
# one triangle, glyphs in the other. order = "AOE" arranges variables by
# the angular order of the eigenvectors of the correlation matrix.
corrplot.mixed(cor(data[quant_vars]), order = "AOE")

# Scatterplot matrix of the quantitative variables.
# The original call had misplaced parentheses: `upper = list(...)` and the
# second `combo = ...` were nested INSIDE the `lower = list(...)` argument,
# so they were never applied as panel specs for the upper triangle.
ggpairs(data[quant_vars],
        lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1),
                     combo = wrap("dot", alpha = 0.4, size = 0.2)),
        upper = list(continuous = wrap("density", alpha = 0.5), combo = "box"),
        title = "Test")

# Correlation heat map with coefficients printed on the tiles.
# Fixed the misspelled argument `lable_round` -> `label_round` (the typo
# was silently ignored, as the knitr warning showed).
ggcorr(data[quant_vars], label = TRUE, label_round = 2, label_alpha = TRUE)

# Panel function for ggpairs' lower triangle: semi-transparent blue
# points with a black smoother overlaid.
lowerFn <- function(data, mapping, ...) {
  ggplot(data = data, mapping = mapping) +
    geom_point(color = "blue", alpha = 0.3, size = 0.1) +
    geom_smooth(color = "black", size = 1)
}
# Scatterplot matrix using the custom lowerFn smoother panels in the
# lower triangle and blue histograms on the diagonal.
g <- ggpairs( 
  data = data[quant_vars],
  lower = list(continuous = wrap(lowerFn)),
  diag = list(continuous = wrap("barDiag", colour = "blue"))
)
# Shrink axis text and flatten the grid so the dense matrix stays legible.
g + theme(
  axis.text = element_text(size = 6),
  axis.title = element_text(size = 6),
  legend.background = element_rect(fill = "white"),
  panel.grid.major = element_line(colour = NA),
  panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = "grey95")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(ellipse)
## Warning: package 'ellipse' was built under R version 4.1.3
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
library(RColorBrewer)

df <- cor(data[quant_vars])

# Build a panel of 100 colors with RColorBrewer
my_colors <- rev(brewer.pal(5, "RdYlGn"))
my_colors <- colorRampPalette(my_colors)(100)

# Order the correlation matrix by correlation with the first variable
ord <- order(df[1, ])
data_ord <- df[ord, ord]
# Map r in [-1, 1] onto color indices. Clamped to [1, 100]: the bare
# `data_ord*50+50` yields index 0 for r = -1, which silently DROPS that
# element of the color vector and misaligns every following color.
col_idx <- pmin(100, pmax(1, data_ord * 50 + 50))
plotcorr(data_ord, col = my_colors[col_idx], mar = c(1, 1, 1, 1))

# 2D binned density of temperature vs. dew point (strongly related, so a
# dense diagonal band is expected).
ggplot(data, aes(x = temp, y = dew)) +
  geom_bin2d(bins = 70) +
  scale_fill_viridis() +
  theme_bw()

# For fun... (after_stat(level) replaces the deprecated ..level.. syntax)
ggplot(data, aes(x = temp, y = dew)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon")

# Area + contour
ggplot(data, aes(x = temp, y = dew)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon", colour = "white")

# gganimate provides transition_time(), ease_aes(), animate(), anim_save();
# it was used below but never loaded.
library(gganimate)
library(gifski)

p <- ggplot(data, aes(hour, count, color = temp)) +
  geom_point() +
  theme_bw() +
  # gganimate specific bits:
  labs(title = 'Date: {frame_time}', x = 'hour', y = 'count') +
  transition_time(date) +
  ease_aes('linear') +
  # midpoint must reference the column: bare `temp` is not an object in
  # the calling environment and would error at build time.
  scale_color_gradient2(midpoint = mean(data$temp), low = "blue", mid = "white", high = "red")

animate(p, duration = 5, fps = 10, width = 800, height = 500, renderer = gifski_renderer())

# anim_save()'s second argument is the animation (defaults to
# last_animation()), not a renderer — passing gifski_renderer() there
# is an error. The renderer was already set in animate() above.
anim_save("gif/test.gif")
p <- ggplot(data, aes(date, count, color = temp)) +
  geom_point() +
  theme_bw() +
  # gganimate specific bits:
  labs(title = 'Time: {frame_time}', x = 'date', y = 'count') +
  transition_time(date) +
  ease_aes('linear') +
  # midpoint must reference the column: bare `temp` is not in scope here.
  scale_color_gradient2(midpoint = mean(data$temp), low = "blue", mid = "white", high = "red")
library(gifski)
 
animate(p, duration = 5, fps = 20, width = 600, height = 200, renderer = gifski_renderer())

# anim_save() takes the animation as its second argument (defaulting to
# last_animation()), not a renderer.
anim_save("gif/test1.gif")